In [5]:
import IPython.core.display as di

# This line will hide code by default when the notebook is exported as HTML.
# The injected script only toggles the input areas and prompts when the page has no
# `body.notebook_app` element, so nothing is hidden while working in the live notebook.
# NOTE(review): recent IPython versions recommend importing display helpers from
# `IPython.display` rather than `IPython.core.display` — confirm against the target version.
di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)

# This line will add a button to toggle visibility of code blocks, for use with the HTML export version
#di.display_html('''<button onclick="jQuery('.input_area').toggle(); jQuery('.prompt').toggle();">Toggle code</button>''', raw=True)
In [2]:
import numpy as np
import pandas as pd
import folium
from folium import plugins
from bokeh.io import output_notebook
import matplotlib.pyplot as plt
%matplotlib inline
from bokeh.plotting import figure, save

from bokeh.models import Legend, HoverTool, ColumnDataSource, Panel,HBar, Select,FactorRange,RadioGroup,Div
from bokeh.models.widgets import  Slider, RangeSlider, Tabs

from bokeh.layouts import column, row, WidgetBox

from bokeh.application.handlers import FunctionHandler
from bokeh.application import Application
from bokeh.io import output_notebook,show,output_file
from math import pi
import bokeh.palettes as c
output_notebook()
import warnings
warnings.filterwarnings('ignore')
Loading BokehJS ...

Motor Vehicle Collisions/Crashes in New York City

The plots below illustrate the Motor Vehicle Collisions/Crashes dataset for New York City. The data cover the period from 2012 to 2020. We have chosen to plot four main perspectives on the data: the geography of the accidents, their underlying causes, the types of vehicle involved, and the temporal patterns.

In [4]:
# Load the collision export. Every column is read as text (dtype='unicode'), which is
# why the count columns are cast to numbers explicitly further down.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0
# (its replacement is on_bad_lines='skip') — confirm which pandas version this notebook targets.
data_raw = pd.read_csv('Motor_Vehicle_Collisions_-_Crashes.csv', sep=',', error_bad_lines=False, index_col=False, dtype='unicode',low_memory = False)
In [5]:
# Injury / fatality count columns: one INJURED and one KILLED column per victim group.
injured_killed = [
    'NUMBER OF {} {}'.format(victim_group, outcome)
    for victim_group in ('PERSONS', 'PEDESTRIANS', 'CYCLIST', 'MOTORIST')
    for outcome in ('INJURED', 'KILLED')
]

# Contributing-factor columns, one per vehicle slot (1 to 5).
cont_factor = ['CONTRIBUTING FACTOR VEHICLE {}'.format(vehicle_no) for vehicle_no in range(1, 6)]
In [6]:
# Replace missing injury/fatality counts with 0 in every column listed in `injured_killed`.
data_raw[injured_killed] = data_raw[injured_killed].fillna(0)
In [7]:
# Cast every injury/fatality count column to int.
# The CSV is loaded as text, so each column goes str -> float -> int (presumably the
# float step is there so values written like '1.0' still parse — TODO confirm).
# Looping over `injured_killed` replaces eight copy-pasted statements and guarantees the
# converted columns are exactly the ones that were zero-filled.
for count_col in injured_killed:
    data_raw[count_col] = data_raw[count_col].astype(str).astype(float).astype(int)
In [8]:
# Build one timestamp from the date (first 10 characters of 'CRASH DATE') and the time.
# An explicit `format` is given, so `infer_datetime_format` had no effect; it is dropped
# because it is deprecated in pandas 2.0.
data_raw['CRASH_DATE_TIME'] = pd.to_datetime(
    data_raw['CRASH DATE'].str[0:10] + ' ' + data_raw['CRASH TIME'],
    format='%m/%d/%Y %H:%M')
data_raw['Year'] = data_raw['CRASH_DATE_TIME'].dt.year
data_raw['Hour'] = data_raw['CRASH_DATE_TIME'].dt.hour

# Strip surrounding whitespace from the street names. 'CROSS STREET NAME' is included
# because it is the column that actually feeds the Intersection key below; leaving it
# unstripped lets stray padding split one intersection into several keys.
for street_col in ['ON STREET NAME', 'CROSS STREET NAME', 'OFF STREET NAME']:
    data_raw[street_col] = data_raw[street_col].str.strip()


def intersection(df):
    """Return the '<ON STREET NAME>, <CROSS STREET NAME>' label.

    Parameters
    ----------
    df : row (Series / mapping) or DataFrame
        Must provide 'ON STREET NAME' and 'CROSS STREET NAME'.

    Returns
    -------
    str when given a single row (as before); a Series of labels when given a
    DataFrame. Missing values are rendered as the text 'nan' in both cases.
    """
    on_street = df['ON STREET NAME']
    cross_street = df['CROSS STREET NAME']
    if isinstance(on_street, pd.Series):
        # Column-wise path: avoids a Python-level call per row of the full dataset.
        return on_street.map(str) + ', ' + cross_street.map(str)
    return str(on_street) + ', ' + str(cross_street)


data_raw['Intersection'] = intersection(data_raw)
In [9]:
# Normalise capitalisation of the vehicle-type, contributing-factor and Intersection
# columns to Title Case, then merge the two spellings of the SUV / station wagon type.
title_cased_columns = (
    ['VEHICLE TYPE CODE {}'.format(slot) for slot in range(1, 6)]
    + ['CONTRIBUTING FACTOR VEHICLE {}'.format(slot) for slot in range(1, 6)]
    + ['Intersection']
)
for column_name in title_cased_columns:
    data_raw[column_name] = data_raw[column_name].str.title()

data_raw.replace('Station Wagon/Sport Utility Vehicle', 'Sport Utility / Station Wagon', inplace = True)
In [10]:
# Drop collisions in which ANY of the five vehicle slots has an uninformative vehicle
# type, or ANY of the five contributing factors is 'Unspecified'.
#
# Two defects of the previous version are fixed here:
#   * every filter restarted from `data_raw`, so only the last one ('Unspecified')
#     actually took effect — the conditions are now combined into a single mask;
#   * the vehicle types were compared against UPPER CASE literals although the columns
#     were title-cased in the preceding cell, so those comparisons could never match.
# Missing values are kept, exactly as with the original `!=` comparisons.
# NOTE: this leaves fewer rows than before; the later `.sample(1000)` calls need at
# least 1000 remaining rows per vehicle type.
vehicle_type_cols = ['VEHICLE TYPE CODE {}'.format(slot) for slot in range(1, 6)]
excluded_vehicle_types = ['Passenger Vehicle', 'Unknown', 'Other']

has_excluded_type = data_raw[vehicle_type_cols].isin(excluded_vehicle_types).any(axis=1)
has_unspecified_factor = data_raw[cont_factor].eq('Unspecified').any(axis=1)

data_raw_filtered = data_raw[~has_excluded_type & ~has_unspecified_factor]
In [11]:
# Columns needed for the per-intersection statistics.
crossData = data_raw[[
    'LATITUDE',
    'NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED',
    'NUMBER OF PEDESTRIANS INJURED', 'NUMBER OF PEDESTRIANS KILLED',
    'LONGITUDE',
    'NUMBER OF MOTORIST INJURED', 'NUMBER OF MOTORIST KILLED',
    'ON STREET NAME', 'CROSS STREET NAME', 'Intersection',
]]
# Keep only collisions for which both coordinates are present.
crossData = crossData[crossData.LONGITUDE.notna() & crossData.LATITUDE.notna()]

# Injury view: drop any row that still has a missing value in the selected columns.
crossInjured = crossData[[
    'NUMBER OF PERSONS INJURED', 'LATITUDE', 'LONGITUDE',
    'ON STREET NAME', 'CROSS STREET NAME', 'Intersection',
]].dropna(axis=0)
In [12]:
# One (latitude, longitude) pair per intersection: the first occurrence is kept.
coordinates = crossInjured[['Intersection', 'LATITUDE', 'LONGITUDE']]
coordNoDup = coordinates.drop_duplicates(subset='Intersection')

# Per-collision injury counts keyed by intersection (location columns removed).
crossCount = crossInjured.drop(
    columns=['ON STREET NAME', 'CROSS STREET NAME', 'LATITUDE', 'LONGITUDE'])
In [13]:
# Total number of injured persons per intersection, joined with its coordinates;
# the ten intersections with the highest totals are kept.
crossSum = crossCount.groupby('Intersection').sum()
merged = pd.merge(coordNoDup, crossSum, how='inner', on='Intersection')
filtInjured = merged.sort_values(by='NUMBER OF PERSONS INJURED', ascending=False).head(10)
In [14]:
# Total number of accidents per intersection.
# The count is stored under a new name instead of overwriting `crossCount`: the cell is
# then safe to re-run, and the per-collision frame stays intact for the cell that sums
# injuries from it.
# Note: after `.count()` the column 'NUMBER OF PERSONS INJURED' holds the number of
# collisions at the intersection, not a number of persons.
accidentCount = crossCount.groupby('Intersection').count()
mergedAcc = coordNoDup.merge(accidentCount, on='Intersection')
filtAcc = mergedAcc.sort_values(by='NUMBER OF PERSONS INJURED', axis=0, ascending=False).iloc[0:10]
In [15]:
# Fatality view: same location columns as the injury view, restricted to collisions
# in which at least one person was killed.
killed_columns = ['NUMBER OF PERSONS KILLED', 'LATITUDE', 'LONGITUDE',
                  'ON STREET NAME', 'CROSS STREET NAME', 'Intersection']
crossKilled = crossData[killed_columns].dropna(axis=0)
crossKilled = crossKilled[crossKilled['NUMBER OF PERSONS KILLED'].gt(0)]
In [16]:
# Per-intersection fatality totals, joined with coordinates; top ten by persons killed.
coordinatesKilled=crossKilled[['Intersection', 'LATITUDE', 'LONGITUDE']]
# NOTE(review): coordNoDupK is built here but never used — the merge below takes its
# coordinates from coordNoDup instead. Confirm which of the two was intended.
coordNoDupK=coordinatesKilled.drop_duplicates(subset='Intersection')
# Keep only the fatality count and the Intersection key before aggregating.
crossK=crossKilled.drop(labels=['ON STREET NAME', 'CROSS STREET NAME','LATITUDE', 'LONGITUDE'], axis=1)
crossSumK=crossK.groupby('Intersection').sum()
mergedK=coordNoDup.merge(crossSumK,how='inner', on='Intersection')
# Ten intersections with the highest number of persons killed.
filtK=mergedK.sort_values(by='NUMBER OF PERSONS KILLED',axis=0, ascending=False).iloc[0:10]

Map plot of top ten intersections

The map below shows the top ten intersections in each of three categories: persons injured, persons killed, and accident locations. The last category covers all accidents, both those in which people were injured or killed and those without any injuries. The circles are clickable and show the name of the intersection and the number of instances in that category. Bigger circles indicate higher counts at that location. In the upper right corner the user can filter between the categories and change the map tile.

In [17]:
# Top ten intersections for accidents, persons killed and persons injured
map_hooray = folium.Map(location=[40.730610, -73.935242], tiles='Stamen Watercolor',
                        zoom_start=11)

folium.TileLayer('Stamen Toner').add_to(map_hooray)

g1 = folium.FeatureGroup(name='Top ten intersections for accidents')
map_hooray.add_child(g1)

g2 = folium.FeatureGroup(name='Top ten intersections for persons killed')
map_hooray.add_child(g2)

g3 = folium.FeatureGroup(name='Top ten intersections for persons injured')
map_hooray.add_child(g3)

colors = c.viridis(3)


def add_intersection_markers(top_ten, value_col, label, group, color, heading_tag='b'):
    """Add one clickable circle per row of `top_ten` to `group`.

    Parameters
    ----------
    top_ten : DataFrame with 'Intersection', 'LATITUDE', 'LONGITUDE' and `value_col`.
    value_col : name of the column holding the count shown in the popup.
    label : text printed in front of the count in the popup.
    group : folium FeatureGroup that receives the markers.
    color : stroke/fill colour of the circles.
    heading_tag : HTML tag wrapped around the 'Intersection:' heading.

    Returns
    -------
    The sum of `value_col` over `top_ten`, which is the base of the size scaling.
    """
    total = top_ten[value_col].sum(axis=0)
    rows = zip(top_ten['Intersection'], top_ten['LATITUDE'],
               top_ten['LONGITUDE'], top_ten[value_col])
    for name, lat, lon, value in rows:
        # Circle size and opacity follow log10(value) / log10(total).
        # NOTE: a value of 1 therefore yields radius 0.
        with np.errstate(divide='ignore', invalid='ignore'):
            percentage = np.log10(value) / np.log10(total)
        # Values <= 0 or a total <= 1 make the ratio negative or non-finite; fall back
        # to 0 so that no invalid radius/opacity reaches the map.
        if not np.isfinite(percentage) or percentage < 0:
            percentage = 0.0

        html = "<{tag}>Intersection:</{tag}><p>{}</p> <b> {} </b><p>{}</p>".format(
            '\n' + str(name), label, str(value), tag=heading_tag)
        iframe = folium.IFrame(html=html, width=250, height=190)
        p = folium.Popup(iframe, max_width=2650)

        folium.CircleMarker([lat, lon],
                            radius=15 * percentage,
                            popup=p,
                            color=color, fill=True,
                            opacity=min(1.0, 0.3 + percentage)  # opacity is capped at 1
                            ).add_to(group)
    return total


sumK = add_intersection_markers(filtK, 'NUMBER OF PERSONS KILLED',
                                'Number of persons killed: ', g2, colors[0])

# In filtAcc the 'NUMBER OF PERSONS INJURED' column holds the accident count.
sumA = add_intersection_markers(filtAcc, 'NUMBER OF PERSONS INJURED',
                                'Number of accidents: ', g1, colors[1])

# Fix: the injured layer is now scaled by its own values (filtInjured); it previously
# read the accident counts from filtAcc. The 'strong' heading tag is kept from the
# original popup markup of this layer.
sumJ = add_intersection_markers(filtInjured, 'NUMBER OF PERSONS INJURED',
                                'Number of persons injured: ', g3, colors[2],
                                heading_tag='strong')

folium.LayerControl(collapsed=True).add_to(map_hooray)

map_hooray # Calls the map to display
Out[17]:
In [18]:
# Vehicle types: location, time, primary vehicle type and primary contributing factor
# of every filtered collision that has no missing value in these columns.
type_columns = ['CRASH_DATE_TIME', 'LATITUDE', 'LONGITUDE', 'VEHICLE TYPE CODE 1',
                'CONTRIBUTING FACTOR VEHICLE 1', 'ON STREET NAME', 'CROSS STREET NAME']
typeData = data_raw_filtered[type_columns]
typeData = typeData[typeData.LONGITUDE.notna() & typeData.LATITUDE.notna()]
typeData = typeData.dropna(axis=0)

# Number of collisions per primary vehicle type; the ten most frequent types are kept.
oneCar = typeData[['VEHICLE TYPE CODE 1', 'LATITUDE']].groupby('VEHICLE TYPE CODE 1').count()
toptenCar = oneCar.sort_values(by='LATITUDE', ascending=False).head(10)
namesCar = toptenCar.index.tolist()
In [19]:
# One frame per vehicle type of interest, selected on the primary vehicle of the collision.
primary_type = typeData['VEHICLE TYPE CODE 1']

sport = typeData[primary_type == 'Sport Utility / Station Wagon']
sedan = typeData[primary_type == 'Sedan']
taxi = typeData[primary_type == 'Taxi']
bus = typeData[primary_type == 'Bus']
van = typeData[primary_type == 'Van']
pickup = typeData[primary_type == 'Pick-Up Truck']
livery = typeData[primary_type == 'Livery Vehicle']
In [20]:
# Random sample of collisions for each vehicle type, in the order expected by the map
# layers (sport, sedan, taxi, van, pickup, bus, livery).
#   * a fixed seed makes the map reproducible under Restart & Run All;
#   * the sample size is capped at the number of available rows, because
#     DataFrame.sample raises ValueError when asked for more rows than exist.
SAMPLE_SIZE = 1000
SAMPLE_SEED = 42
carList = [
    vehicle_rows.sample(n=min(SAMPLE_SIZE, len(vehicle_rows)), random_state=SAMPLE_SEED)
    for vehicle_rows in (sport, sedan, taxi, van, pickup, bus, livery)
]

Map plot of most common vehicle types in accidents

The map below shows the accident locations of a random sample from the most common vehicle types across the whole time span of the dataset. The random sample size is 1000 instances for each vehicle type. The dots are clickable and show the date and time of the crash as well as the vehicle type. In the upper right corner the user can filter between vehicle types and change the map tile.

In [21]:
m = folium.Map(location=[40.730610, -73.935242], tiles='Stamen Watercolor',
               zoom_start=11)

# One toggleable layer per vehicle type, in the same order as `carList`.
layer_names = ['Sport utility / Station wagon', 'Sedan', 'Taxi', 'Van',
               'Pick-Up truck', 'Bus', 'Livery Vehicle']
group_list = [folium.FeatureGroup(name=layer_name) for layer_name in layer_names]
for layer in group_list:
    m.add_child(layer)
g1, g2, g3, g4, g5, g6, g7 = group_list

folium.TileLayer('Stamen Toner').add_to(m)
colors = c.viridis(len(group_list))

# zip() would silently drop samples if the two lists ever got out of step.
assert len(carList) == len(group_list), 'carList and group_list must have the same length'

# Pair each sample with its layer and colour directly instead of keeping a manual counter.
for car, layer, color in zip(carList, group_list, colors):
    crashes = zip(car['CRASH_DATE_TIME'], car['VEHICLE TYPE CODE 1'],
                  car['LATITUDE'], car['LONGITUDE'])
    for crash_time, vehicle_type, lat, lon in crashes:
        text1 = '\n' + str(crash_time)
        text2 = '\n' + str(vehicle_type)
        # Fix: the second paragraph is now closed with </p> (it was a second <p>).
        html = "<strong>Date and time:</strong><p>{}</p> <b> Vehicle type: </b><p>{}</p>".format(text1, text2)
        iframe = folium.IFrame(html=html, width=250, height=190)
        p = folium.Popup(iframe, max_width=2650)

        folium.CircleMarker([lat, lon],
                            radius=2,
                            popup=p,
                            color=color, fill=True, opacity=0.8
                            ).add_to(layer)

folium.LayerControl(collapsed=True).add_to(m)
m
Out[21]:

Bar plot of Number of persons injured/killed related to Contributing Factor

The bar plot below shows the number of victims injured/killed by the different contributing factors. The plot can be viewed in either linear or log scale. Furthermore, the slider can be used to restrict the range shown, which gives an idea of which contributing factors are most responsible for injuring/killing victims. This plot may take a while to update and load, please be patient. Furthermore, due to constraints on processing time when running the visualisation, this plot contains 2/41 parts of the original dataset, which amounts to about 80 000 observations.

In [8]:
# Embed the externally hosted page (bokehplot1 on herokuapp.com) in an iframe; the
# IFrame object is the cell's last expression, so it is rendered as the cell output.
# NOTE(review): the URL carries an `fbclid` query parameter, which looks like a
# link-tracking id rather than something the app needs — confirm and drop it.
# NOTE(review): this import sits outside the notebook's import cell.
from IPython.display import IFrame
IFrame('https://bokehplot1.herokuapp.com/bokehplot1?fbclid=IwAR1kLSnSNQMCrIocbmxlrjzX2rEK04AN9W5q1uTGoV13NyxnrDg9EjaCdro', width=1100, height=600)
Out[8]:

Bar plot of Collisions per hour related to Vehicle type

This bar plot shows the proportion of accidents during the day for the most common vehicle types. It is also possible to filter on the contributing causes for each vehicle in the collision. The labels "Vehicle type code #" and "Contributing factor vehicle #" refer to the different cars involved in an accident, with number one being the primary car in the accident. This plot may take a while to update and load, please be patient. Furthermore, due to constraints on processing time when running the visualisation, this plot contains 2/41 parts of the original dataset, which amounts to about 80 000 observations.

In [10]:
# Embed the externally hosted page (bokehplot2 on herokuapp.com) in an iframe; the
# IFrame object is the cell's last expression, so it is rendered as the cell output.
# NOTE(review): the URL carries an `fbclid` query parameter, which looks like a
# link-tracking id rather than something the app needs — confirm and drop it.
# NOTE(review): IFrame is imported again here, outside the notebook's import cell.
from IPython.display import IFrame
IFrame('https://bokehplot2.herokuapp.com/bokehplot2?fbclid=IwAR0DeGz4HuLiWQMY2GF1pV-58PiBOye4H18PSMK9N1KUO4TgTkGOxhngCRw', width=950, height=600)
Out[10]:
In [ ]: